# Packages loaded by this example.
library(xml2)
library(rvest)
library(purrr)
library(dplyr)
library(tibble)

# Wikipedia page listing mathematicians born in the 19th century.
url <- "https://en.wikipedia.org/wiki/List_of_mathematicians_born_in_the_19th_century"

# Download and parse the page, then keep every <li> found inside a <ul>
# that sits under an element with class "mw-body-content".
ppl <- read_html(url) |>
  html_nodes(".mw-body-content ul li")

# Inspect the first matched list item.
ppl[[1]]
{html_node}
<li>
[1] <a href="/wiki/Florence_Eliza_Allen" title="Florence Eliza Allen">Florenc ...
from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
import pandas as pd

# Wikipedia page listing mathematicians born in the 19th century.
url = "https://en.wikipedia.org/wiki/List_of_mathematicians_born_in_the_19th_century"

req = urllib.request.Request(url)
# The context manager closes the HTTP response even if read() raises,
# which the explicit close() call could not guarantee.
with urllib.request.urlopen(req) as page_bytearray:
    page = page_bytearray.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so results can vary by machine.
soup = BeautifulSoup(page, "html.parser")

# Keep every <li> found inside a <ul> under an element with class
# "mw-body-content", then inspect the first match.
ppl = soup.select(".mw-body-content ul li")
ppl[0]
<li><a href="/wiki/Florence_Eliza_Allen" title="Florence Eliza Allen">Florence Eliza Allen</a> (1876–1960)</li>
Example: Mathematicians
Not all HTML nodes have the same attributes/children.
#' Call a function on one item, returning `NA` instead of failing
#'
#' @param i The item passed as the first argument to `fn`.
#' @param fn The function to call on `i`.
#' @param ... Additional arguments forwarded to `fn`.
#' @return The value of `fn(i, ...)`, or `NA` when the call signals an error
#'   or returns a zero-length value (including `NULL`).
try_na <- function(i, fn, ...) {
  # try() is deliberately left non-silent, so the underlying error message
  # is still printed while execution continues.
  res <- try(fn(i, ...))

  # An error and an empty result are both reported as a single NA.
  if (inherits(res, "try-error") || length(res) == 0) {
    res <- NA
  }

  res
}
`try_na()` will return:
- the value, if one exists;
- `NA` if the command results in an error;
- `NA` if the result has length 0.
def try_na(x, expression):
    """Evaluate ``expression`` with ``x`` bound to the name ``x``; return pd.NA on failure.

    Parameters
    ----------
    x : object
        The value made available to ``expression`` as the local name ``x``.
    expression : str
        Python source for a single expression, e.g. ``"x.text"``.

    Returns
    -------
    object
        The value of the expression, or ``pd.NA`` when ``x`` is missing, the
        expression raises, the result is ``None``, or the result is an empty
        sized object.
    """
    # If x is NA, then the result must also be NA
    # for most HTML-parsing expressions... NOT FOOLPROOF
    if pd.isna(x):
        return pd.NA

    try:
        # NOTE(security): eval() runs arbitrary code. Only pass expression
        # strings written by the programmer, never text from external input.
        res = eval(expression, {}, {"x": x})
    except Exception:
        # Catch ordinary errors only, so KeyboardInterrupt/SystemExit
        # still propagate.
        return pd.NA

    if res is None:  # Tests for an empty return value
        return pd.NA
    # Only sized results can be "empty"; unsized ones (e.g. an int) are
    # returned as-is rather than raising TypeError from len().
    if hasattr(res, "__len__") and len(res) == 0:
        return pd.NA
    return res
Error in xml_text(x, trim = trim) : Unexpected node type
Error in xml_text(x, trim = trim) : Unexpected node type
Error in xml_attr(x, name, default = default) : Unexpected node type
Error in xml_attr(x, name, default = default) : Unexpected node type
Error in xml_attr(x, name, default = default) : Unexpected node type
Error in xml_attr(x, name, default = default) : Unexpected node type
# Preview the first six rows of the scraped table.
math_ppl |>
  head(n = 6L)
# A tibble: 6 × 4
content name name2 link
<chr> <chr> <chr> <chr>
1 Florence Eliza Allen (1876–1960) Flor… Flor… /wik…
2 Emil Artin (1898–1962) Emil… Emil… /wik…
3 George David Birkhoff (1884–1944) Geor… Geor… /wik…
4 Maxime Bôcher (1867–1918) Maxi… Maxi… /wik…
5 Leonard Eugene Dickson (1874–1954), algebra and number theo… Leon… Leon… /wik…
6 Jesse Douglas (1897–1965), Fields Medalist Jess… Jess… /wik…
# Full text of each list item.
content = [try_na(i, "x.text") for i in ppl]

# First <a> tag inside each list item; try_na yields pd.NA when find()
# returns None, and the lookups below then propagate that pd.NA.
link_info = [try_na(i, "x.find('a')") for i in ppl]

# Link text, "title" attribute and "href" attribute of each anchor.
name = [try_na(i, 'x.text') for i in link_info]
name2 = [try_na(i, 'x.attrs["title"]') for i in link_info]
link = [try_na(i, 'x.attrs["href"]') for i in link_info]

# Assemble one row per list item and preview the first rows.
math_ppl = pd.DataFrame({'content': content, 'name': name, 'name2': name2, 'link': link})
math_ppl.head()
content ... link
0 Florence Eliza Allen (1876–1960) ... /wiki/Florence_Eliza_Allen
1 Emil Artin (1898–1962) ... /wiki/Emil_Artin
2 George David Birkhoff (1884–1944) ... /wiki/George_Birkhoff
3 Maxime Bôcher (1867–1918) ... /wiki/Maxime_B%C3%B4cher
4 Leonard Eugene Dickson (1874–1954), algebra an... ... /wiki/Leonard_Eugene_Dickson
[5 rows x 4 columns]